from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK data that word_tokenize needs.
# NOTE(review): newer NLTK releases may additionally require the
# "punkt_tab" resource — confirm against the installed NLTK version.
nltk.download("punkt")

# Toy corpus used to train the model.
corpus = [
    'I like to eat apples',
    'Apples are tasty',
    'I enjoy eating bananas'
]

# Lowercase then tokenize each sentence into a list of word tokens.
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# Train the Word2Vec model.
# min_count=1 keeps every word — necessary for a corpus this small,
# where most words occur only once.
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Words most similar to a given word (list of (word, cosine-similarity) pairs).
similar_words = model.wv.most_similar("apples")
print("Words similar to 'apples':", similar_words)

# Cosine similarity between two words.
# FIX: the original computed this and discarded the result; print it so the
# demo actually shows the value.
similarity = model.wv.similarity('apples', 'bananas')
print("Similarity between 'apples' and 'bananas':", similarity)

# Odd-one-out: the word least similar to the rest of the list.
# FIX: result was also silently discarded in the original; print it.
odd_word = model.wv.doesnt_match(['apples', 'bananas', 'enjoy'])
print("Word that doesn't match:", odd_word)

# Raw embedding vector for a word.
print(model.wv['enjoy'])

# All words in the trained model's vocabulary.
vocabulary = list(model.wv.index_to_key)
print("Vocabulary:", vocabulary)

# Embedding vector for a specific word.
word_vector = model.wv['apples']
print("Vector representation of 'apples':", word_vector)